We want to look at real data to simulate realistic datasets.

load("Expt4c2b_filtdata.Rda")
load("E4c2b_slingshot_wsforkelly.RData")

# select only 4 clusters : pink, cyan, orange, darkblue
select = c(3, 4, 11, 13)
cc_rev = cc_rev[select]
clus.labels = clus.labels[clus.labels %in% c(3, 4, 10, 12)]
clus.labels = droplevels(clus.labels)

names(batch) <- colnames(counts)
counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix( ~ qcpca$x[, 1:5])

vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000], ]

detection_rate <- colSums(core > 0)
coverage <- colSums(core)
colMerged <- cc_rev[clus.labels]

We select only the cells that pass QC, color coded by the experimental time point. I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it should not work other than on Davide and Fanny computers.

mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
pal =colorRampPalette(c("black",'black',"red","yellow"), space="rgb")

par(mfrow = c(1,2))
plot(mean, rowMeans(core == 0), xlim = c(1,11), ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              main = '1,000 most variable genes')
smoothScatter(mean, rowMeans(core == 0),xlim = c(1,11),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = '1,000 most variable genes')

Fit data with K = 4, V intercept in both Mu and Pi, commondispersion = FALSE, and X accounting for qc. Notice the warnings.

print(system.time(zinb <- zinbFit(core, ncores = 3, K = 4, X = mod, 
                                  commondispersion = FALSE)))
## Warning in simpute.als(x, J, thresh, lambda, maxit, trace.it, warm.start, :
## Convergence not achieved by 100 iterations
##    user  system elapsed 
## 527.478 207.518 293.303

True W

If we simulate W from real data, it will look like that. I used K = 4, maybe K = 3 would be a better choice to simulate data.

pairs(zinb@W, col = colMerged, pch = 19, main = "ZINB")

Gamma

gamma_mu = zinb@gamma_mu[1,]
gamma_pi = zinb@gamma_pi[1,]

df <- data.frame(gamma_mu = gamma_mu,
                 gamma_pi = gamma_pi,
                 detection_rate = detection_rate,
                 coverage = coverage)
pairs(df, pch=19, col=colMerged)

print(cor(df, method="spearman"))
##                  gamma_mu   gamma_pi detection_rate   coverage
## gamma_mu        1.0000000 -0.3384076      0.5949632  0.7542561
## gamma_pi       -0.3384076  1.0000000     -0.8733322 -0.1700023
## detection_rate  0.5949632 -0.8733322      1.0000000  0.3309894
## coverage        0.7542561 -0.1700023      0.3309894  1.0000000
gamma = data.frame(gamma_mu = gamma_mu, gamma_pi = gamma_pi, celltype = colMerged)
gamma = melt(gamma)

ggplot(gamma, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 20, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable) + xlab('gamma')

ggplot(gamma, aes(x = variable, y = value)) + xlab('') + theme_bw() +
  geom_boxplot() + coord_flip() + facet_grid(~ variable, scales = 'free') +
  theme_bw() + ylab('gamma') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

ggplot(gamma, aes(value, fill = celltype)) + geom_density(alpha = 0.2) +
  facet_grid(~ variable) + xlab('gamma') + theme_bw()

Beta

Note that we accounting for batch and qc in X, so beta has more than one row (M = 25 rows). Here, we look only at the first row of beta.

beta_mu = zinb@beta_mu[1,]
beta_pi = zinb@beta_pi[1,]

df <- data.frame(beta_mu = beta_mu,
                 beta_pi = beta_pi)
pairs(df, pch=19)

print(cor(df, method="spearman"))
##             beta_mu     beta_pi
## beta_mu  1.00000000 -0.07272988
## beta_pi -0.07272988  1.00000000
beta = data.frame(beta_mu = beta_mu, beta_pi = beta_pi)
beta = melt(beta)
ggplot(beta, aes(x = value)) + 
  geom_histogram(aes(y = ..density..), bins = 100, col = 'gray') +
  geom_density(col= 'blue', size = .5) + 
  facet_grid(~ variable, scales = 'free') + xlab('beta')

ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot() + facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta') + scale_x_discrete(breaks = c('', ''), drop=FALSE)

# remove outliers
max = max(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
min = min(quantile(beta_pi, 0.1), quantile(beta_mu, 0.1))
ggplot(beta, aes(x = variable, y = value)) + theme_bw() + xlab('') +
  geom_boxplot(outlier.shape = NA) +
  facet_grid(~ variable, scales = 'free') + coord_flip() +
  theme_bw() + ylab('beta removing outliers') +
  scale_x_discrete(breaks = c('', ''), drop=FALSE) +
  scale_y_continuous(limits = c(min, max))
## Warning: Removed 1247 rows containing non-finite values (stat_boxplot).

Alpha

pairs(t(zinb@alpha_mu), main = 'alpha_mu')

pairs(t(zinb@alpha_pi), main = 'alpha_pi')

df <- data.frame(alpha_mu_1 = zinb@alpha_mu[1, ],
                 alpha_mu_2 = zinb@alpha_mu[2, ],
                 alpha_mu_3 = zinb@alpha_mu[3, ],
                 alpha_mu_4 = zinb@alpha_mu[4, ],
                 alpha_pi_1 = zinb@alpha_pi[1, ],
                 alpha_pi_2 = zinb@alpha_pi[2, ],
                 alpha_pi_3 = zinb@alpha_pi[3, ],
                 alpha_pi_4 = zinb@alpha_pi[4, ])
pairs(df, pch=19)

print(cor(df, method="spearman"))
##              alpha_mu_1  alpha_mu_2  alpha_mu_3  alpha_mu_4   alpha_pi_1
## alpha_mu_1  1.000000000 -0.01821755 -0.10282729 -0.02862087  0.199990196
## alpha_mu_2 -0.018217554  1.00000000  0.25672718  0.12787712  0.115313683
## alpha_mu_3 -0.102827287  0.25672718  1.00000000 -0.15494610  0.026598495
## alpha_mu_4 -0.028620869  0.12787712 -0.15494610  1.00000000 -0.030037446
## alpha_pi_1  0.199990196  0.11531368  0.02659849 -0.03003745  1.000000000
## alpha_pi_2  0.252274656 -0.35305144  0.01979558 -0.08733802  0.053448173
## alpha_pi_3 -0.050755119 -0.09634999 -0.32138195  0.10866043  0.008684217
## alpha_pi_4  0.009658894 -0.18435567  0.10700202 -0.03672412  0.027262323
##             alpha_pi_2   alpha_pi_3   alpha_pi_4
## alpha_mu_1  0.25227466 -0.050755119  0.009658894
## alpha_mu_2 -0.35305144 -0.096349992 -0.184355668
## alpha_mu_3  0.01979558 -0.321381945  0.107002019
## alpha_mu_4 -0.08733802  0.108660433 -0.036724117
## alpha_pi_1  0.05344817  0.008684217  0.027262323
## alpha_pi_2  1.00000000 -0.076684829  0.171043275
## alpha_pi_3 -0.07668483  1.000000000 -0.047410883
## alpha_pi_4  0.17104328 -0.047410883  1.000000000

Dispersion

Dispersion was estimated on FQ norm counts using function estimateDisp from edgeR. Dispersions estimated using edgeR and ZINB does not seem to be on the same scale. I think it is because I estimate dispersion on (normalized) counts with edgeR, but (if I understood well) zinb estimate the dispersion on log1p(counts). We should rely on values of dispersion from edgeR as we are going to simulates the counts with \[Y_{i,j} \sim NB(\mu_{i,j}, \phi_j)\]

par(mfrow=c(1,1))
set = newSeqExpressionSet(core)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
## Design matrix not provided. Switch to the classic mode.
plot(disp$tagwise.dispersion, 1/exp(zinb@zeta), ylab = 'zinb dispersion',
     xlab = 'edgeR tagwise dispersion', main = 'Dispersion')

print(cor(disp$tagwise.dispersion, 1/exp(zinb@zeta), method="spearman"))
## [1] 0.137023
par(mfrow = c(1, 2))
plot(density(1/exp(zinb@zeta)), main = 'zinb dispersion')
plot(density(disp$tagwise.dispersion), main = 'edgeR dispersion')

par(mfrow = c(1, 2))
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
plot(mean, 1/exp(zinb@zeta), main = 'zinb', xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))
plot(mean, disp$tagwise.dispersion, main = 'edgeR',xlab = 'mean expression', ylab = 'dispersion', ylim = c(0, 14))

Estimated mu and pi

par(mfrow=c(1, 2))
detection_rate <- colMeans(core>0)
coverage <- colSums(core)
plot(rowMeans(getPi(zinb)), detection_rate, xlab="Average estimated Pi", ylab="Detection Rate for each cell", pch=19, col=colMerged, ylim = c(0, 1))
plot(rowMeans(log1p(getMu(zinb))), coverage, xlab="Average estimated log Mu", ylab="Coverage", pch=19, col=colMerged)

par(mfrow=c(1, 3))
smoothScatter(mean, rowMeans(core == 0),xlim = c(2,8),
              nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
              xlab = "Mean Expression", ylab = "Dropout Rate", 
              colramp = pal, main = 'True')
smoothScatter(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)), nbin = 256,
              nrpoints=Inf, pch="", cex=.7, xlim = c(2,8),
              xlab = "Estimated Mean Expression", main = 'Estimated',
              ylab = "Estimated Dropout Rate",ylim = c(0,1),
              colramp = pal)
plot(colMeans(log1p(getMu(zinb))), colMeans(getPi(zinb)),
     xlab = "Estimated Mean Expression", xlim = c(2,8),
     ylab = "Estimated Dropout Rate", pch=19,
     ylim = c(0, 1), main = 'Estimated')